Load libraries

#load libraries
library(tidyverse) 
library(quanteda) #for text cleaning
library(igraph) #for creating graphs
library(visNetwork) #for visualizing graphs
library(wordcloud) #for creating wordclouds

Load function scripts and data

#load_functions
source("calculatecoocstats.R") #calculate co-occurrence statistics
source("grapher.R") #create graph
#Wiedemann, Gregor; Niekler, Andreas (2017): Hands-on: A five day text mining course for humanists and social scientists in R. Proceedings of the 1st Workshop on Teaching NLP for Digital Humanities (Teach4DH@GSCL 2017), Berlin.
source("rawcounts.R") #find raw counts of co-occurrences
source("token_filter.R") #filter tokens

Load token data

# Load the saved token object and normalise it for analysis.
load("token.all.RData")

# Lower-case every token so downstream matching is case-insensitive.
token.all <- tokens_tolower(token.all)

# Down-sample so every decade contributes equally (22,638 = size of the
# smallest decade).
token.all <- tokens_sample(
  token.all,
  size = 22638,
  replace = FALSE,
  prob = NULL,
  by = decade
)

Find Probability of Verbs/Adj/Noun given male/female across decades

# Collapse "<word>/<POS>" tokens down to their bare POS tag (NOUN/VERB/ADJ)
# so co-occurrence counts can be aggregated by part of speech.
pos_replace <- function(toks.replace) {
  tokens_replace(
    toks.replace,
    pattern = c("*/NOUN", "*/VERB", "*/ADJ"),
    replacement = c("NOUN", "VERB", "ADJ")
  )
}
token.pos <- pos_replace(token.all)

p_decdat <- data.frame()          # accumulator: one row per pos x gender x decade
pos <- c('verb', 'adj', 'noun')   # POS categories analysed

for (j in 0:7) {                  # eight decades, 1940s through 2010s
  year <- 1940 + j * 10           # decade start year

  # Raw co-occurrence counts for this decade (rows = terms, cols = POS).
  pos_counts <- rawcounts(token_filter("all", year, token.pos))

  # Empirical probability of each POS in the given gender's row.
  gender_p <- function(row_name, gender_label) {
    counts <- pos_counts[row_name, pos]
    out <- data.frame(pos = names(counts), p = counts / sum(counts))
    out$gender <- gender_label
    out
  }

  # Male block first, then female, exactly as before.
  p_decdat.temp <- rbind(
    gender_p("male/characters", "male"),
    gender_p("female/characters", "female")
  )
  p_decdat.temp$year <- year
  p_decdat <- rbind(p_decdat, p_decdat.temp)
}

Adjectives

##             Df    Sum Sq   Mean Sq F value Pr(>F)
## year         1 0.0001165 1.165e-04   2.296  0.156
## gender       1 0.0000094 9.370e-06   0.185  0.675
## year:gender  1 0.0000066 6.620e-06   0.130  0.724
## Residuals   12 0.0006091 5.076e-05

Verbs

##             Df    Sum Sq   Mean Sq F value Pr(>F)
## year         1 4.602e-05 4.602e-05   2.372  0.149
## gender       1 4.000e-07 4.000e-07   0.021  0.888
## year:gender  1 2.500e-07 2.500e-07   0.013  0.911
## Residuals   12 2.328e-04 1.940e-05

Nouns

## Saving 7 x 5 in image
##             Df    Sum Sq   Mean Sq F value Pr(>F)
## year         1 1.608e-05 1.608e-05   2.039  0.179
## gender       1 1.364e-05 1.364e-05   1.729  0.213
## year:gender  1 4.300e-06 4.297e-06   0.545  0.475
## Residuals   12 9.461e-05 7.885e-06

Community analysis

1940 - 2020

Load and prepare data
# Reload the full token set and repeat the normalisation used above.
load("token.all.RData")
# Case-fold all tokens.
token.all <- tokens_tolower(token.all)
# Equalise decades by sampling down to the smallest decade's size.
token.all <- tokens_sample(
  token.all,
  size = 22638,
  replace = FALSE,
  prob = NULL,
  by = decade
)
#token.all <- token_filter2('all', 2000, 2010, token.all)
Function to detect and plot community structure
# Detect and plot co-occurrence communities for one gender's character node.
#
# Args:
#   toks.all: quanteda tokens object of POS-tagged "word/pos" tokens.
#   gender:   'male' or 'female' -- selects the "<gender>/characters" node.
#   nn:       number of top-degree words reported per community.
#
# Side effects: prints the community summary table, the top-`nn` words per
# community, and the modularity score, then plots the induced subgraph.
#
# NOTE(review): despite the "1940 - 2020" heading above, the date range is
# hard-coded to 2000-2010 here -- confirm this is intentional.
detect_communities <- function(toks.all, gender = 'male', nn = 10){
  toks.all = token_filter2('all', 2000, 2010, toks.all)
  # Remove the "<gender>/characters" token plus its +/-5-token window,
  # leaving padding so other token positions are preserved.
  # NOTE(review): selection = 'remove' DISCARDS the gender contexts; if the
  # intent was to analyse words near the gender term, 'keep' may have been
  # meant -- confirm against the original analysis design.
  toks <- toks.all %>% 
     tokens_select(pattern = paste(gender, '/characters', sep = ''), selection = 'remove', padding = TRUE, window = 5)

  #filter to keep only words that occur at least 10 times
dfm <-  toks %>% dfm() %>% dfm_trim(min_termfreq = 10)
filtered = colnames(dfm)
# NOTE(review): this re-selects from the GLOBAL `token.all`, not the locally
# filtered `toks.all`/`toks`, so the 2000-2010 filter and the gender-window
# removal above do not constrain the tokens used below -- looks like a bug;
# confirm whether `toks.all` was intended here.
toks <- token.all %>% 
  tokens_select(pattern = filtered, selection = 'keep', padding = TRUE)

#feature co-occurrence matrix for males
fcmat = fcm(toks, context = c("window"),
                 count = c("weighted"), #words are weighted within the window
                 window = 5)

# Build an undirected weighted graph from the co-occurrence matrix by
# round-tripping through an edge list, then drop self-loops.
graph = graph_from_adjacency_matrix(fcmat, weighted = TRUE) #create graph from matrix
edgelist <- get.data.frame(graph)
edgelist_m <- as.matrix(edgelist[ ,c("from", "to")])

graph <- graph_from_edgelist(edgelist_m, directed = FALSE) 
graph <- set.edge.attribute(graph, "weight", value = edgelist$weight)
graph = simplify(graph, remove.loops = TRUE) #remove self-looping edges

#louvian communities
# NOTE(review): the edge attribute set above is "weight" (singular), so
# E(graph)$weights is NULL here; igraph then falls back to the "weight"
# attribute, so the typo is likely benign but misleading -- verify.
louvain <- cluster_louvain(graph, weights = E(graph)$weights)#detect communities 
graph$community <- louvain$membership

#most important word in each community
# For each community, record its size and its highest-betweenness word.
communities <- data.frame()

for (i in unique(graph$community)) {
  # create subgraphs for each community
  subgraph <- induced_subgraph(graph, v = which(graph$community == i))
  # get size of each subgraph
  size <- igraph::gorder(subgraph)
  # get betweenness centrality
  btwn <-  igraph::betweenness(subgraph)
  communities <- communities %>%
    dplyr::bind_rows(
      data.frame(community = i,
                 n_characters = size,
                 most_important = names(which(btwn == max(btwn)))
      )
    )
}

# Keep the five largest communities by node count.
communities = arrange(communities, desc(n_characters))
top_comm <- communities$community[1:5]
print(communities)

#top ten in each community
# Rank words within each top community by degree; `n` counts communities
# processed and is reused below when assigning plot groups.
top_ten <- data.frame()
n = 0
for (i in top_comm) {
  # create subgraphs for each community
  subgraph <- induced_subgraph(graph, v = which(graph$community == i))
    n = n + 1
    # get degree
    degree <-  igraph::degree(subgraph)
    # get top ten degrees
    top <- names(head(sort(degree, decreasing = TRUE), nn))
    result <- data.frame(community = i, rank = 1:nn, word = top)
    top_ten <- top_ten %>% 
    dplyr::bind_rows(result)
}

print(top_ten)
#write.csv(top_ten, paste(gender, '.csv', sep = ''))
print(paste('modularity =', modularity(louvain)))

#Visualizing the communities
# Restrict the plot to the top-`nn` words of the top communities and attach
# each node's community as its plotting group.
subgraph <- induced_subgraph(graph, v = top_ten$word)
subgraph <- simplify(subgraph)
subgraph$community
nodes = data.frame(word = names(V(subgraph)))
group = rep(1:n, each = nn)
top_ten$group = group
clusters = inner_join(nodes, top_ten)
subgraph$community <- clusters$group
#unique(subgraph$community)

# give our nodes some properties, incl scaling them by degree and coloring them by community
V(subgraph)$size <- 5
V(subgraph)$frame.color <- "white"
V(subgraph)$color <- subgraph$community
#V(male_subgraph)$label <- V(male_subgraph)$name
V(subgraph)$label.cex <- 1.8

# also color edges according to their starting node
#edge.start <- ends(subgraph, es = E(subgraph), names = F)[,1]
#E(subgraph)$color <- V(subgraph)$color[edge.start]
#E(subgraph)$arrow.mode <- 0

#plot by groups
#make clusters first
clust_obj = make_clusters(subgraph, membership = clusters$group)

# weights <- ifelse(crossing(male_clust, male_subgraph), 1, 100)
# layout <- layout_with_kk(male_subgraph, weights=weights)
# plot(male_subgraph, layout=layout)

prettyColors <- c("turquoise4", "azure4", "olivedrab","deeppink4", "blue")
communityColors <- prettyColors[membership(clust_obj)]

# Heavy weights on within-community edges pull community members together in
# the force-directed layout; light between-community weights push them apart.
edge.weights <- function(community, network, weight.within = 100, weight.between = 1) {
  bridges <- crossing(communities = community, graph = network)
  weights <- ifelse(test = bridges, yes = weight.between, no = weight.within)
  return(weights) 
}
E(subgraph)$weight <- edge.weights(clust_obj, subgraph)
layout <- layout_with_fr(subgraph, weights=E(subgraph)$weight)
plot(subgraph, layout=layout, col = communityColors)
}
Male communities
detect_communities(token.all, 'male', 10)
##   community n_characters most_important
## 1         2          539    battle/noun
## 2         5          383       many/adj
## 3         6          356       true/adj
## 4         3          346    starts/verb
## 5         4           89   suicide/noun
## 6         1           87    travel/noun
##    community rank              word
## 1          2    1        takes/verb
## 2          2    2          car/noun
## 3          2    3        death/noun
## 4          2    4       killed/verb
## 5          2    5         body/noun
## 6          2    6         room/noun
## 7          2    7          dead/adj
## 8          2    8      killing/verb
## 9          2    9        kills/verb
## 10         2   10        using/verb
## 11         5    1         other/adj
## 12         5    2         have/verb
## 13         5    3         take/verb
## 14         5    4         find/verb
## 15         5    5          men/noun
## 16         5    6           go/verb
## 17         5    7          get/verb
## 18         5    8        leave/verb
## 19         5    9        money/noun
## 20         5   10         make/verb
## 21         6    1 female/characters
## 22         6    2       father/noun
## 23         6    3        tells/verb
## 24         6    4       mother/noun
## 25         6    5          has/verb
## 26         6    6        finds/verb
## 27         6    7       family/noun
## 28         6    8          man/noun
## 29         6    9        house/noun
## 30         6   10         wife/noun
## 31         3    1           new/adj
## 32         3    2         life/noun
## 33         3    3       begins/verb
## 34         3    4         film/noun
## 35         3    5           own/adj
## 36         3    6       become/verb
## 37         3    7       school/noun
## 38         3    8    including/verb
## 39         3    9        story/noun
## 40         3   10        world/noun
## 41         4    1       police/noun
## 42         4    2         local/adj
## 43         4    3        taken/verb
## 44         4    4       murder/noun
## 45         4    5      officer/noun
## 46         4    6         gets/verb
## 47         4    7     arrested/verb
## 48         4    8       arrive/verb
## 49         4    9       prison/noun
## 50         4   10         sent/verb
## [1] "modularity = 0.119871233523078"

Female communities
detect_communities(token.all, 'female', 10)
##   community n_characters most_important
## 1         3          734   killing/verb
## 2         6          673   arrives/verb
## 3         2          630      local/adj
## 4         4          506     night/noun
## 5         1          205     store/noun
## 6         5            2     serial/adj
## 7         5            2    killer/noun
##    community rank            word
## 1          3    1        car/noun
## 2          3    2      house/noun
## 3          3    3     killed/verb
## 4          3    4       room/noun
## 5          3    5       body/noun
## 6          3    6    killing/verb
## 7          3    7    causing/verb
## 8          3    8      kills/verb
## 9          3    9        dead/adj
## 10         3   10      falls/verb
## 11         6    1 male/characters
## 12         6    2         is/verb
## 13         6    3         be/verb
## 14         6    4     father/noun
## 15         6    5        has/verb
## 16         6    6        man/noun
## 17         6    7      tells/verb
## 18         6    8     family/noun
## 19         6    9       wife/noun
## 20         6   10     mother/noun
## 21         2    1       other/adj
## 22         2    2        men/noun
## 23         2    3       find/verb
## 24         2    4        get/verb
## 25         2    5     police/noun
## 26         2    6      group/noun
## 27         2    7       town/noun
## 28         2    8        are/verb
## 29         2    9       kill/verb
## 30         2   10       help/verb
## 31         4    1       time/noun
## 32         4    2         new/adj
## 33         4    3       life/noun
## 34         4    4       film/noun
## 35         4    5       make/verb
## 36         4    6     begins/verb
## 37         4    7       have/verb
## 38         4    8         own/adj
## 39         4    9     school/noun
## 40         4   10        way/noun
## 41         1    1      money/noun
## 42         1    2       take/verb
## 43         1    3       give/verb
## 44         1    4        job/noun
## 45         1    5        pay/verb
## 46         1    6     offers/verb
## 47         1    7        more/adj
## 48         1    8    company/noun
## 49         1    9      given/verb
## 50         1   10     giving/verb
## [1] "modularity = 0.092771822096592"

Modelling edge weight across decades

Action - Led/Verb
# PPMI of `term` with each gender node, tracked across the eight decades.
all_ind <- data.frame()   # accumulator
term <- "led/verb"        # term to find PPMI for
pos <- "verb"             # POS filter applied before computing PPMI

for (i in 0:7) {
  year <- 1940 + 10 * i   # decade start year

  # Pull the PPMI table for one gender node in one decade, reduce it to `term`,
  # and tag the row with year/gender.
  ppmi_row <- function(node, gender_label) {
    ind <- grapher(node, 10, token_filter(pos, year, token.all), "MI")[[3]][]
    ind$rank <- 1:nrow(ind)            # rank words (kept for parity; redundant)
    ind <- filter(ind, names == term)
    ind$year <- year
    ind$gender <- gender_label
    ind
  }

  # Male row first, then female, exactly as before.
  all_ind <- rbind(
    all_ind,
    ppmi_row("male/characters", "male"),
    ppmi_row("female/characters", "female")
  )
}

# Scatter + per-gender PMI trajectory over decades, with linear fits
# (shaded ribbon = confidence band) on a minimal theme.
ggplot(all_ind, aes(x = year, y = loglik, color = gender)) +
  geom_point(color = "black") +
  geom_line(size = 1) +
  geom_smooth(
    aes(fill = gender),
    method = "lm", se = TRUE, size = 1, alpha = 0.1
  ) +
  theme_minimal() +
  ylab("Pointwise Mutual Information") +
  ggtitle("Led/Verb") +
  theme(
    axis.text        = element_text(color = "black", size = 12),
    axis.title       = element_text(color = "black", size = 14),
    legend.text      = element_text(color = "black", size = 12),
    legend.title     = element_text(color = "black", size = 14),
    panel.grid.major = element_line(colour = "grey50", size = 0.3),
    panel.grid.minor = element_line(colour = "grey50", size = 0.3)
  )
## `geom_smooth()` using formula 'y ~ x'

ggsave("loves_verb.png", width = 6, height = 4)
## `geom_smooth()` using formula 'y ~ x'
# ANCOVA: does the PMI trend over time differ by gender?
# (year * gender expands to main effects plus interaction.)
ancova.word <- lm(loglik ~ year * gender, data = all_ind)
summary(ancova.word)
## 
## Call:
## lm(formula = loglik ~ year * gender, data = all_ind)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.138808 -0.021391 -0.004403  0.027092  0.144828 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)   
## (Intercept)     -10.617836   2.515989  -4.220  0.00119 **
## year              0.005276   0.001274   4.142  0.00137 **
## gendermale       13.394465   3.558145   3.764  0.00270 **
## year:gendermale  -0.006576   0.001801  -3.650  0.00333 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08255 on 12 degrees of freedom
## Multiple R-squared:  0.9061, Adjusted R-squared:  0.8826 
## F-statistic: 38.61 on 3 and 12 DF,  p-value: 1.926e-06
anova(ancova.word)
## Analysis of Variance Table
## 
## Response: loglik
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## year         1 0.03320 0.03320   4.872  0.047513 *  
## gender       1 0.66530 0.66530  97.621 4.078e-07 ***
## year:gender  1 0.09080 0.09080  13.323  0.003326 ** 
## Residuals   12 0.08178 0.00682                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Repeat the trend test on the female rows only (simple regression on year).
all_ind_fem <- filter(all_ind, gender == "female")
ancova.word <- lm(loglik ~ year, data = all_ind_fem)
summary(ancova.word)
## 
## Call:
## lm(formula = loglik ~ year, data = all_ind_fem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.13881 -0.04043 -0.01444  0.03569  0.14483 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -10.617836   3.081295  -3.446   0.0137 *
## year          0.005276   0.001560   3.382   0.0148 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1011 on 6 degrees of freedom
## Multiple R-squared:  0.6559, Adjusted R-squared:  0.5986 
## F-statistic: 11.44 on 1 and 6 DF,  p-value: 0.01482
anova(ancova.word) 
## Analysis of Variance Table
## 
## Response: loglik
##           Df  Sum Sq  Mean Sq F value  Pr(>F)  
## year       1 0.11691 0.116908  11.437 0.01482 *
## Residuals  6 0.06133 0.010222                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1